import pandas as pd
import xarray as xr
import calendar
import yaml
import osData Cleansing and Records Calculations
Part 2 of 3
This notebook follows sequentially from NOAA-CO-OPS-data in which we downloaded the latest data for a particular NOAA CO-OPS weather and tide station. The data record and corresponding metadata were written to file. Here we use those data and calculates several daily and monthly statistics and records. This is done in two steps:
Filter the data: We do not perform any quality assurance or quality control checks, but we do remove from the records any days missing a specified amount of data and any months missing a specified number of days of data.
Calculate records:
- Daily and monthly averages
- Record high daily and monthly averages*
- Record low daily and monthly averages*
- Average daily and monthly high
- Lowest daily and monthly high*
- Record daily and monthly high*
- Average daily and monthly low
- Highest daily and monthly low*
- Record daily and monthly low*
Years are also noted for those records marked by an asterisk (*).
Packages and configurations
First we import the packages we need.
By default, Python only displays warnings the first time they are thrown. Ideally, we want a code that does not throw any warnings, but it sometimes takes soem trial and error to resolve the issue being warned about. So, for diagnostic purposes, we’ll set the kernel to always display warnings.
import warnings
warnings.filterwarnings('always')Functions
Next, we define a number of functions that will come in handy later.
Helper functions
def camel(text):
"""Convert 'text' to camel case"""
s = text.replace(',','').replace("-", " ").replace("_", " ")
s = s.split()
if len(text) == 0:
return text
return s[0].lower() + ''.join(i.capitalize() for i in s[1:])
def DOY(df):
"""Determine year day out of 366"""
# Day of year as integer
df['YearDay'] = df.index.day_of_year.astype(int)
# Years that are NOT leap years
leapInd = [not calendar.isleap(i) for i in df.index.year]
mask = (leapInd) & (df.index.month > 2)
# Advance by one day everything after February 28
df.loc[mask, 'YearDay'] += 1
return dfFiltering data
def count_missing_hours(group, threshold=3):
"""Return True if the number of hours in a day with missing data is less
than or equal to 'threshold' and False otherwise.
"""
missing_hours = group.resample('1h').mean().isna().sum()
return missing_hours <= threshold
def count_missing_days(group, threshold=2):
"""Return True if the number of days in a month with missing data is less
than or equal to 'theshold' and False otherwise. Two tests are performed:
missing data (NaN) and compare to the number of days in the given month.
"""
try:
days_in_month = pd.Period(group.index[0].strftime(format='%Y-%m-%d')).days_in_month
missing_days = group.resample('1D').mean().isna().sum()
missing_days_flag = missing_days <= threshold
days_in_month_flag = days_in_month - group.resample('1D').mean().size <= threshold
return min(missing_days_flag, days_in_month_flag)
except IndexError:
pass
def filter_data(data, hr_threshold=3, day_threshold=2):
"""Filter data to remove days with more than 'hr_threshold' missing hours
of data and months with more than 'day_threshold' days of missing data.
"""
# Filter out days missing more than <hr_threshold> hours
filtered = data.groupby(pd.Grouper(freq='1D')).filter(lambda x: count_missing_hours(group=x, threshold=hr_threshold))
# Filter out months missing more than <day_threshold> days
filtered = filtered.groupby(pd.Grouper(freq='1M')).filter(lambda x: count_missing_days(group=x, threshold=day_threshold))
return filteredCalculate records
def daily_highs(df):
"""Daily highs"""
return df.groupby(pd.Grouper(freq='1D')).max(numeric_only=True)
def daily_lows(df):
"""Daily lows"""
return df.groupby(pd.Grouper(freq='1D')).min(numeric_only=True)
def daily_avgs(df, decimals=1, true_average=False):
"""Daily averages by calendar day rounded to 'decimals'. If
'true_average' is True, all measurements from each 24-hour day will be
used to calculate the average. Otherwise, only the maximum and minimum
observations are used. Defaults to False (meteorological standard).
"""
if true_average:
results = df.groupby(pd.Grouper(freq='1D')).mean(numeric_only=True)
else:
dailyHighs = daily_highs(df)
dailyLows = daily_lows(df)
results = (dailyHighs + dailyLows) / 2
return results.round(decimals)
def daily_avg(df, decimals=1, true_average=False):
"""Daily averages rounded to 'decimals'. If 'true_average' is True, all
measurements from each 24-hour day will be used to calculate the daily
average. Otherwise, only the maximum and minimum observations are used.
Defaults to False (meteorological standard).
"""
dailyAvgs = daily_avgs(df, decimals=decimals, true_average=true_average)
dailyAvg = dailyAvgs.groupby('YearDay').mean(numeric_only=True).round(decimals)
dailyAvg.index = dailyAvg.index.astype(int)
results = xr.DataArray(dailyAvg, dims=['yearday', 'variable'])
results.name = 'Daily Average'
return results
def monthly_highs(df, decimals=1, true_average=False):
"""Monthly highs rounded to 'decimals'. If 'true_average' is True, all
measurements from each 24-hour day will be used to calculate the daily
average. Otherwise, only the maximum and minimum observations are used.
Defaults to False (meteorological standard).
"""
dailyAvgs = daily_avgs(df, decimals=decimals, true_average=False)
monthHighs = dailyAvgs.groupby(pd.Grouper(freq='M')).max(numeric_only=True)
return monthHighs
def monthly_lows(df, decimals=1, true_average=False):
"""Monthly lows rounded to 'decimals'. If 'true_average' is True, all
measurements from each 24-hour day will be used to calculate the daily
average. Otherwise, only the maximum and minimum observations are used.
Defaults to False (meteorological standard).
"""
dailyAvgs = daily_avgs(df, decimals=decimals, true_average=true_average)
monthLows = dailyAvgs.groupby(pd.Grouper(freq='M')).min(numeric_only=True)
return monthLows
def monthly_avg(df, decimals=1, true_average=False):
"""Monthly averages for variable 'var' rounded to 'decimals'. If
'true_average' is True, all measurements from each 24-hour day will be
used to calculate the daily average. Otherwise, only the maximum and
minimum observations are used. Defaults to False (meteorological
standard).
"""
dailyAvgs = daily_avgs(df, decimals=decimals, true_average=true_average)
monthlyMeans = dailyAvgs.groupby(pd.Grouper(freq='M')).mean(numeric_only=True).round(decimals)
monthlyMeans.drop('YearDay', axis=1, inplace=True)
monthlyAvg = monthlyMeans.groupby(monthlyMeans.index.month).mean(numeric_only=True).round(decimals)
monthlyAvg.index = monthlyAvg.index.astype(int)
results = xr.DataArray(monthlyAvg, dims=['month', 'variable'])
results.name = 'Monthly Average'
return results
def record_high_daily_avg(df, decimals=1, true_average=False):
"""Record high daily averages rounded to 'decimals'. If 'true_average'
is True, all measurements from each 24-hour day will be used to
calculate the daily average. Otherwise, only the maximum and minimum
observations are used. Defaults to False (meteorological standard).
"""
# Calculate the records
dailyAvgs = daily_avgs(df=df, decimals=decimals, true_average=true_average)
recordHighDailyAvg = dailyAvgs.groupby('YearDay').max(numeric_only=True).round(decimals)
recordHighDailyAvg.index = recordHighDailyAvg.index.astype(int)
# Record years
recordHighDailyAvgYear = dailyAvgs.groupby('YearDay').apply(lambda x: x.idxmax(numeric_only=True).dt.year)
recordHighDailyAvgYear.drop('YearDay', axis=1, inplace=True)
recordHighDailyAvgYear.index = recordHighDailyAvgYear.index.astype(int)
recordHighDailyAvgYear.columns = [i+' Year' for i in recordHighDailyAvgYear.columns]
# Create xarray
results = pd.concat((recordHighDailyAvg, recordHighDailyAvgYear), axis=1)
results = xr.DataArray(results, dims=['yearday', 'variable'])
results.name = 'Record High Daily Average'
return results
def record_high_monthly_avg(df, decimals=1, true_average=False):
"""Record high monthly averages rounded to 'decimals'. If
'true_average' is True, all measurements from each 24-hour day will be
used to calculate the daily average. Otherwise, only the maximum and
minimum observations are used. Defaults to False (meteorological
standard).
"""
# Calculate the records
dailyAvgs = daily_avgs(df, decimals=decimals, true_average=true_average)
monthlyAvgs = dailyAvgs.groupby(pd.Grouper(freq='M')).mean(numeric_only=True).round(decimals)
monthlyAvgs.drop('YearDay', axis=1, inplace=True)
recordHighMonthlyAvg = monthlyAvgs.groupby(monthlyAvgs.index.month).max(numeric_only=True)
recordHighMonthlyAvg.index = recordHighMonthlyAvg.index.astype(int)
# Record years
recordHighMonthlyAvgYear = monthlyAvgs.groupby(monthlyAvgs.index.month).apply(lambda x: x.idxmax(numeric_only=True).dt.year)
recordHighMonthlyAvgYear.index = recordHighMonthlyAvgYear.index.astype(int)
recordHighMonthlyAvgYear.columns = [i+' Year' for i in recordHighMonthlyAvgYear.columns]
# Create xarray
results = pd.concat((recordHighMonthlyAvg, recordHighMonthlyAvgYear), axis=1)
results = xr.DataArray(results, dims=['month', 'variable'])
results.name = 'Record High Monthly Average'
return results
def record_low_daily_avg(df, decimals=1, true_average=False):
"""Record low daily averages rounded to 'decimals'. If 'true_average'
True, all measurements from each 24-hour day will be used to calculate
the average. Otherwise, only the maximum and minimum observations are
used. Defaults to False (meteorological standard)."""
# Calculate the records
dailyAvgs = daily_avgs(df=df, decimals=decimals, true_average=true_average)
recordLowDailyAvg = dailyAvgs.groupby('YearDay').min(numeric_only=True).round(decimals)
recordLowDailyAvg.index = recordLowDailyAvg.index.astype(int)
# Record years
recordLowDailyAvgYear = dailyAvgs.groupby('YearDay').apply(lambda x: x.idxmin(numeric_only=True).dt.year)
recordLowDailyAvgYear.drop('YearDay', axis=1, inplace=True)
recordLowDailyAvgYear.index = recordLowDailyAvgYear.index.astype(int)
recordLowDailyAvgYear.columns = [i+' Year' for i in recordLowDailyAvgYear.columns]
# Create xarray
results = pd.concat((recordLowDailyAvg, recordLowDailyAvgYear), axis=1)
results = xr.DataArray(results, dims=['yearday', 'variable'])
results.name = 'Record Low Daily Average'
return results
def record_low_monthly_avg(df, decimals=1, true_average=False):
"""Record low monthly averages rounded to 'decimals'. If 'true_average'
is True, all measurements from each 24-hour day will be used to
calculate the daily average. Otherwise, only the maximum and minimum
observations are used. Defaults to False (meteorological standard).
"""
# Calculate the records
dailyAvgs = daily_avgs(df, decimals=decimals, true_average=true_average)
monthlyAvgs = dailyAvgs.groupby(pd.Grouper(freq='M')).mean(numeric_only=True).round(decimals)
monthlyAvgs.drop('YearDay', axis=1, inplace=True)
recordLowMonthlyAvg = monthlyAvgs.groupby(monthlyAvgs.index.month).min(numeric_only=True)
recordLowMonthlyAvg.index = recordLowMonthlyAvg.index.astype(int)
# Record years
recordLowMonthlyAvgYear = monthlyAvgs.groupby(monthlyAvgs.index.month).apply(lambda x: x.idxmin(numeric_only=True).dt.year)
recordLowMonthlyAvgYear.index = recordLowMonthlyAvgYear.index.astype(int)
recordLowMonthlyAvgYear.columns = [i+' Year' for i in recordLowMonthlyAvgYear.columns]
# Create xarray
results = pd.concat((recordLowMonthlyAvg, recordLowMonthlyAvgYear), axis=1)
results = xr.DataArray(results, dims=['month', 'variable'])
results.name = 'Record Low Monthly Average'
return results
def avg_daily_high(df, decimals=1):
"""Average daily highs rounded to 'decimals'."""
dailyHighs = daily_highs(df)
results = dailyHighs.groupby('YearDay').mean(numeric_only=True).round(decimals)
results = xr.DataArray(results, dims=['yearday', 'variable'])
results.name = 'Average Daily High'
return results
def avg_monthly_high(df, decimals=1, true_average=False):
"""Average monthly highs rounded to 'decimals'. If 'true_average' is
True, all measurements from each 24-hour day will be used to calculate
the daily average. Otherwise, only the maximum and minimum observations
are used. Defaults to False (meteorological standard).
"""
monthlyHighs = monthly_highs(df, decimals=decimals, true_average=true_average)
monthlyHighs.drop('YearDay', axis=1, inplace=True)
avgMonthlyHighs = monthlyHighs.groupby(monthlyHighs.index.month).mean(numeric_only=True).round(decimals)
results = xr.DataArray(avgMonthlyHighs, dims=['month', 'variable'])
results.name = 'Average Monthly High'
return results
def lowest_daily_high(df, decimals=1):
"""Lowest daily highs rounded to 'decimals'."""
# Calculate the record
dailyHighs = daily_highs(df)
lowestHigh = dailyHighs.groupby('YearDay').min(numeric_only=True).round(decimals)
lowestHigh.index = lowestHigh.index.astype(int)
# Record years
lowestHighYear = dailyHighs.groupby('YearDay').apply(lambda x: x.idxmin(numeric_only=True).dt.year)
lowestHighYear.drop('YearDay', axis=1, inplace=True)
lowestHighYear.index = lowestHighYear.index.astype(int)
lowestHighYear.columns = [i+' Year' for i in lowestHighYear.columns]
# Create xarray
results = pd.concat((lowestHigh, lowestHighYear), axis=1)
results = xr.DataArray(results, dims=['yearday', 'variable'])
results.name = 'Lowest Daily High'
return results
def lowest_monthly_high(df, decimals=1, true_average=False):
"""Lowest monthly highs rounded to 'decimals'. If 'true_average' is
True, all measurements from each 24-hour day will be used to calculate
the daily average. Otherwise, only the maximum and minimum observations
are used. Defaults to False (meteorological standard).
"""
# Calculate the record
monthlyHighs = monthly_highs(df, decimals=decimals, true_average=true_average)
monthlyHighs.drop('YearDay', axis=1, inplace=True)
lowMonthlyHigh = monthlyHighs.groupby(monthlyHighs.index.month).min(numeric_only=True).round(decimals)
lowMonthlyHigh.index = lowMonthlyHigh.index.astype(int)
# Record years
lowMonthlyHighYear = monthlyHighs.groupby(monthlyHighs.index.month).apply(lambda x: x.idxmin(numeric_only=True).dt.year)
lowMonthlyHighYear.index = lowMonthlyHighYear.index.astype(int)
lowMonthlyHighYear.columns = [i+' Year' for i in lowMonthlyHighYear.columns]
# Create xarray
results = pd.concat((lowMonthlyHigh, lowMonthlyHighYear), axis=1)
results = xr.DataArray(results, dims=['month', 'variable'])
results.name = 'Lowest Monthly High'
return results
def record_daily_high(df, decimals=1):
"""Record daily highs rounded to 'decimal'."""
# Calculate the record
dailyHighs = daily_highs(df)
recordHigh = dailyHighs.groupby('YearDay').max(numeric_only=True).round(decimals)
recordHigh.index = recordHigh.index.astype(int)
# Record years
recordHighYear = dailyHighs.groupby('YearDay').apply(lambda x: x.idxmax(numeric_only=True).dt.year)
recordHighYear.drop('YearDay', axis=1, inplace=True)
recordHighYear.index = recordHighYear.index.astype(int)
recordHighYear.columns = [i+' Year' for i in recordHighYear.columns]
# Create xarray
results = pd.concat((recordHigh, recordHighYear), axis=1)
results = xr.DataArray(results, dims=['yearday', 'variable'])
results.name = 'Record Daily High'
return results
def record_monthly_high(df, decimals=1, true_average=False):
"""Record monthly highs rounded to 'decimals'. If 'true_average' is
True, all measurements from each 24-hour day will be used to calculate
the daily average. Otherwise, only the maximum and minimum observations
are used. Defaults to False (meteorological standard).
"""
# Calculate the record
monthlyHighs = monthly_highs(df, decimals=decimals, true_average=true_average)
monthlyHighs.drop('YearDay', axis=1, inplace=True)
recordMonthlyHigh = monthlyHighs.groupby(monthlyHighs.index.month).max(numeric_only=True).round(decimals)
recordMonthlyHigh.index = recordMonthlyHigh.index.astype(int)
# Record years
recordMonthlyHighYear = monthlyHighs.groupby(monthlyHighs.index.month).apply(lambda x: x.idxmax(numeric_only=True).dt.year)
recordMonthlyHighYear.index = recordMonthlyHighYear.index.astype(int)
recordMonthlyHighYear.columns = [i+' Year' for i in recordMonthlyHighYear.columns]
# Create xarray
results = pd.concat((recordMonthlyHigh, recordMonthlyHighYear), axis=1)
results = xr.DataArray(results, dims=['month', 'variable'])
results.name = 'Record Monthly High'
return results
def avg_daily_low(df, decimals=1):
"""Average daily lows rounded to 'decimals'."""
dailyLows = daily_lows(df)
results = dailyLows.groupby('YearDay').mean(numeric_only=True).round(decimals)
results = xr.DataArray(results, dims=['yearday', 'variable'])
results.name = 'Average Daily Low'
return results
def avg_monthly_low(df, decimals=1, true_average=False):
"""Average monthly lows rounded to 'decimals'. If 'true_average' is
True, all measurements from each 24-hour day will be used to calculate
the daily average. Otherwise, only the maximum and minimum observations
are used. Defaults to False (meteorological standard).
"""
monthlyLows = monthly_lows(df, decimals=decimals, true_average=true_average)
monthlyLows.drop('YearDay', axis=1, inplace=True)
avgMonthlyLows = monthlyLows.groupby(monthlyLows.index.month).mean(numeric_only=True).round(decimals)
results = xr.DataArray(avgMonthlyLows, dims=['month', 'variable'])
results.name = 'Average Monthly Low'
return results
def highest_daily_low(df, decimals=1):
"""Highest daily lows rounded to 'decimals'."""
# Calculate the record
dailyLows = daily_lows(df)
highestLow = dailyLows.groupby('YearDay').max(numeric_only=True).round(decimals)
highestLow.index = highestLow.index.astype(int)
# Record years
highestLowYear = dailyLows.groupby('YearDay').apply(lambda x: x.idxmax(numeric_only=True).dt.year)
highestLowYear.drop('YearDay', axis=1, inplace=True)
highestLowYear.index = highestLowYear.index.astype(int)
highestLowYear.columns = [i+' Year' for i in highestLowYear.columns]
# Create xarray
results = pd.concat((highestLow, highestLowYear), axis=1)
results = xr.DataArray(results, dims=['yearday', 'variable'])
results.name = 'Highest Daily Low'
return results
def highest_monthly_low(df, decimals=1, true_average=False):
"""Highest monthly lows rounded to 'decimals'. If 'true_average' is
True, all measurements from each 24-hour day will be used to calculate
the daily average. Otherwise, only the maximum and minimum observations
are used. Defaults to False (meteorological standard).
"""
# Calculate the record
monthlyLows = monthly_lows(df, decimals=decimals, true_average=true_average)
monthlyLows.drop('YearDay', axis=1, inplace=True)
highestMonthlyLow = monthlyLows.groupby(monthlyLows.index.month).max(numeric_only=True).round(decimals)
highestMonthlyLow.index = highestMonthlyLow.index.astype(int)
# Record years
highestMonthlyLowYear = monthlyLows.groupby(monthlyLows.index.month).apply(lambda x: x.idxmax(numeric_only=True).dt.year)
highestMonthlyLowYear.index = highestMonthlyLowYear.index.astype(int)
highestMonthlyLowYear.columns = [i+' Year' for i in highestMonthlyLowYear.columns]
# Create xarray
results = pd.concat((highestMonthlyLow, highestMonthlyLowYear), axis=1)
results = xr.DataArray(results, dims=['month', 'variable'])
results.name = 'Highest Monthly Low'
return results
def record_daily_low(df, decimals=1):
"""Record daily lows rounded to 'decimals'."""
# Calculate the record
dailyLows = daily_lows(df)
recordLow = dailyLows.groupby('YearDay').min(numeric_only=True).round(decimals)
recordLow.index = recordLow.index.astype(int)
# Record years
recordLowYear = dailyLows.groupby('YearDay').apply(lambda x: x.idxmin(numeric_only=True).dt.year)
recordLowYear.drop('YearDay', axis=1, inplace=True)
recordLowYear.index = recordLowYear.index.astype(int)
recordLowYear.columns = [i+' Year' for i in recordLowYear.columns]
# Create xarray
results = pd.concat((recordLow, recordLowYear), axis=1)
results = xr.DataArray(results, dims=['yearday', 'variable'])
results.name = 'Record Daily Low'
return results
def record_monthly_low(df, decimals=1, true_average=False):
"""Record monthly lows rounded to 'decimals'. If 'true_average' is
True, all measurements from each 24-hour day will be used to calculate
the daily average. Otherwise, only the maximum and minimum observations
are used. Defaults to False (meteorological standard).
"""
# Calculate the record
monthlyLows = monthly_lows(df, decimals=decimals, true_average=true_average)
monthlyLows.drop('YearDay', axis=1, inplace=True)
recordMonthlyLow = monthlyLows.groupby(monthlyLows.index.month).min(numeric_only=True).round(decimals)
recordMonthlyLow.index = recordMonthlyLow.index.astype(int)
# Record years
recordMonthlyLowYear = monthlyLows.groupby(monthlyLows.index.month).apply(lambda x: x.idxmin(numeric_only=True).dt.year)
recordMonthlyLowYear.index = recordMonthlyLowYear.index.astype(int)
recordMonthlyLowYear.columns = [i+' Year' for i in recordMonthlyLowYear.columns]
# Create xarray
results = pd.concat((recordMonthlyLow, recordMonthlyLowYear), axis=1)
results = xr.DataArray(results, dims=['month', 'variable'])
results.name = 'Record Monthly Low'
return results
def number_of_years_byday(df):
"""Number of years in the historical data records by day of year."""
numYears = pd.concat([df[[v, 'YearDay']].dropna().groupby('YearDay')\
.apply(lambda x: len(x.index.year.unique())) \
for v in filtered_data.columns if v != 'YearDay'], axis=1)
numYears.columns = [v for v in df.columns if v != 'YearDay']
results = xr.DataArray(numYears, dims=['yearday', 'variable'])
results.name = 'Number of Years'
return results
def number_of_years_bymonth(df):
"""Number of years in the historical data records by month."""
numYears = pd.concat([df[v].dropna().groupby(df[v].dropna().index.month)\
.apply(lambda x: len(x.index.year.unique())) \
for v in filtered_data.columns if v != 'YearDay'], axis=1)
numYears.columns = [v for v in df.columns if v != 'YearDay']
results = xr.DataArray(numYears, dims=['month', 'variable'])
results.name = 'Number of Years'
return results
def generate_yeardays():
return pd.date_range(start='2020-01-01',end='2020-12-31', freq='1D').strftime('%d-%b')Data cleaning
First we need to load in the data and metadata for the desired station. This will be used to determine the directory from which to load the data.
As before, stationname is the custom human-readable “City, ST” string for the station. Since we are not downloading data, we do not need the NOAA-COOPS station ID number.
stationname = 'Virginia Key, FL'Derive the local directory name containing for data from the station name. This is the same way the directory was created when the data were downloaded.
dirname = camel(stationname)
outdir = os.path.join(os.getcwd(), dirname)
print(f"Station folder: {dirname}")
print(f"Full directory: {outdir}")Station folder: virginiaKeyFl
Full directory: /home/climatology/virginiaKeyFl
Next, load the data and metadata.
# Metadata
with open(os.path.join(outdir, 'metadata.yml')) as m:
meta = yaml.safe_load(m)
# Observational data
data = pd.read_csv(os.path.join(outdir, 'observational_data_record.csv.gz'),
index_col=f'time_{meta["tz"]}', parse_dates=True,
compression='infer')Now we filter the data to remove days with more than 3 hours of missing data and months with more than 2 days of missing data. These thresholds are stored in meta and can easily be changed. We have to do this one variable at a time because this is sensor-dependent, so it takes a short while to run.
filtered_data = pd.concat([filter_data(data[var],
hr_threshold=meta['hr_threshold'],
day_threshold=meta['day_threshold'])
for var in meta['variables']], axis=1)Confirm that the data were filtered:
data.shape(2466580, 2)
filtered_data.shape(2174766, 2)
Calculate records
Now we’re ready to determine the records using all of the functions above. We’ll store these in an xarray dataset and add the appropriate metadata for convenience. But first, we need to add a day of year (DOY) column so that we can calculate daily records. We’ve used a function to do this because accounting for leap years is not trivial.
filtered_data = DOY(filtered_data)daily_records = \
xr.Dataset({'Daily Average': daily_avg(filtered_data),
'Record High Daily Average': record_high_daily_avg(filtered_data),
'Record Low Daily Average': record_low_daily_avg(filtered_data),
'Average High': avg_daily_high(filtered_data),
'Lowest High': lowest_daily_high(filtered_data),
'Record High': record_daily_high(filtered_data),
'Average Low': avg_daily_low(filtered_data),
'Highest Low': highest_daily_low(filtered_data),
'Record Low': record_daily_low(filtered_data),
'Years': number_of_years_byday(filtered_data)},
attrs={k:v for k, v in meta.items() if k not in ['outdir', 'variables', 'units']})monthly_records = \
xr.Dataset({'Monthly Average': monthly_avg(filtered_data),
'Record High Monthly Average': record_high_monthly_avg(filtered_data),
'Record Low Monthly Average': record_low_monthly_avg(filtered_data),
'Average High': avg_monthly_high(filtered_data),
'Lowest High': lowest_monthly_high(filtered_data),
'Record High': record_monthly_high(filtered_data),
'Average Low': avg_monthly_low(filtered_data),
'Highest Low': highest_monthly_low(filtered_data),
'Record Low': record_monthly_low(filtered_data),
'Years': number_of_years_bymonth(filtered_data)},
attrs={k:v for k, v in meta.items() if k not in ['outdir', 'variables', 'units']})Add data units and time series ranges for each variable to the arrays as metadata attributes.
for k, v in meta['units'].items():
daily_records.attrs[k+' units'] = v
for var in daily_records.coords['variable'].values:
if 'Year' not in var:
daily_records.attrs[var+' data range'] = \
(filtered_data[var].dropna().index.min().strftime('%Y-%m-%d'),
filtered_data[var].dropna().index.max().strftime('%Y-%m-%d'))for k, v in meta['units'].items():
monthly_records.attrs[k+' units'] = v
for var in monthly_records.coords['variable'].values:
if 'Year' not in var:
monthly_records.attrs[var+' data range'] = \
(filtered_data[var].dropna().index.min().strftime('%Y-%m-%d'),
filtered_data[var].dropna().index.max().strftime('%Y-%m-%d'))What do we have now? Let’s take a look:
daily_records<xarray.Dataset> Size: 120kB
Dimensions: (yearday: 366, variable: 4)
Coordinates:
* yearday (yearday) int64 3kB 1 2 3 4 5 ... 363 364 365 366
* variable (variable) object 32B 'Air Temperature' ... 'W...
Data variables:
Daily Average (yearday, variable) float64 12kB 71.5 nan ... nan
Record High Daily Average (yearday, variable) float64 12kB 78.0 ... 2.02...
Record Low Daily Average (yearday, variable) float64 12kB 54.4 ... 2.01...
Average High (yearday, variable) float64 12kB 75.0 nan ... nan
Lowest High (yearday, variable) float64 12kB 63.3 ... 2.01...
Record High (yearday, variable) float64 12kB 79.3 ... 2.02...
Average Low (yearday, variable) float64 12kB 67.9 nan ... nan
Highest Low (yearday, variable) float64 12kB 76.8 ... 2.02...
Record Low (yearday, variable) float64 12kB 45.5 ... 2.01...
Years (yearday, variable) float64 12kB 23.0 nan ... nan
Attributes:
datum: MHHW
day_threshold: 2
hr_threshold: 3
last_updated: 2024-05-25 10:00:00
stationid: 8723214
stationname: Virginia Key, FL
tz: lst
unit_system: english
Air Temperature units: F
Water Temperature units: F
Air Temperature data range: ('1994-04-01', '2024-04-30')
Water Temperature data range: ('1994-04-01', '2024-04-30')monthly_records<xarray.Dataset> Size: 4kB
Dimensions: (month: 12, variable: 4)
Coordinates:
* month (month) int64 96B 1 2 3 4 5 6 7 8 9 10 11 12
* variable (variable) object 32B 'Air Temperature' ... ...
Data variables:
Monthly Average (month, variable) float64 384B 68.7 nan ... nan
Record High Monthly Average (month, variable) float64 384B 72.6 ... 2.01...
Record Low Monthly Average (month, variable) float64 384B 63.0 ... 2.01...
Average High (month, variable) float64 384B 76.0 nan ... nan
Lowest High (month, variable) float64 384B 73.0 ... 2.00...
Record High (month, variable) float64 384B 78.0 ... 2.01...
Average Low (month, variable) float64 384B 55.6 nan ... nan
Highest Low (month, variable) float64 384B 63.5 ... 2.01...
Record Low (month, variable) float64 384B 48.3 ... 2.01...
Years (month, variable) float64 384B 23.0 nan ... nan
Attributes:
datum: MHHW
day_threshold: 2
hr_threshold: 3
last_updated: 2024-05-25 10:00:00
stationid: 8723214
stationname: Virginia Key, FL
tz: lst
unit_system: english
Air Temperature units: F
Water Temperature units: F
Air Temperature data range: ('1994-04-01', '2024-04-30')
Water Temperature data range: ('1994-04-01', '2024-04-30')How are these stored? Let’s consider the monthly stats. Each statistic is its own variable within the dataset. Take Record High for example:
monthly_records['Record High']<xarray.DataArray 'Record High' (month: 12, variable: 4)> Size: 384B
array([[ 78. , 2015. , 81.7, 2017. ],
[ 78.6, 2021. , 81.5, 2021. ],
[ 82.8, 2003. , 83.2, 2021. ],
[ 85.8, 2020. , 85.8, 2020. ],
[ 85.2, 1995. , 87.7, 2021. ],
[ 87.6, 2009. , 90.4, 2010. ],
[ 88.7, 2018. , 92. , 2021. ],
[ 88.5, 2022. , 92.2, 2021. ],
[ 86.7, 2021. , 91.2, 2021. ],
[ 86.8, 2023. , 89. , 2016. ],
[ 82. , 2020. , 85. , 2020. ],
[ 79.6, 1994. , 84.6, 2016. ]])
Coordinates:
* month (month) int64 96B 1 2 3 4 5 6 7 8 9 10 11 12
* variable (variable) object 32B 'Air Temperature' ... 'Water Temperature ...Here, the rows are months and the columns are the records or corresponding year. Let’s see what the variables are:
monthly_records['Record High'].coords['variable']<xarray.DataArray 'variable' (variable: 4)> Size: 32B
array(['Air Temperature', 'Air Temperature Year', 'Water Temperature',
'Water Temperature Year'], dtype=object)
Coordinates:
* variable (variable) object 32B 'Air Temperature' ... 'Water Temperature ...Alternatively, we can select a specific variable and see all of its stats (converting to a dataframe makes it easier to see):
monthly_records.sel(variable='Air Temperature').to_dataframe().drop('variable', axis=1)| Monthly Average | Record High Monthly Average | Record Low Monthly Average | Average High | Lowest High | Record High | Average Low | Highest Low | Record Low | Years | |
|---|---|---|---|---|---|---|---|---|---|---|
| month | ||||||||||
| 1 | 68.7 | 72.6 | 63.0 | 76.0 | 73.0 | 78.0 | 55.6 | 63.5 | 48.3 | 23.0 |
| 2 | 70.8 | 74.9 | 65.5 | 76.5 | 74.2 | 78.6 | 59.4 | 70.0 | 47.9 | 23.0 |
| 3 | 72.3 | 77.6 | 66.1 | 78.5 | 74.2 | 82.8 | 63.3 | 72.0 | 55.1 | 24.0 |
| 4 | 75.6 | 79.4 | 72.8 | 80.8 | 77.3 | 85.8 | 68.3 | 72.6 | 61.2 | 24.0 |
| 5 | 78.7 | 80.7 | 77.0 | 82.5 | 80.8 | 85.2 | 73.8 | 77.1 | 67.9 | 21.0 |
| 6 | 81.5 | 83.6 | 79.8 | 84.8 | 82.8 | 87.6 | 77.6 | 80.8 | 75.1 | 20.0 |
| 7 | 82.9 | 85.0 | 81.0 | 85.8 | 84.2 | 88.7 | 79.0 | 82.3 | 76.1 | 25.0 |
| 8 | 83.2 | 85.9 | 81.8 | 85.7 | 84.0 | 88.5 | 79.3 | 83.6 | 76.1 | 24.0 |
| 9 | 82.0 | 82.7 | 80.6 | 85.1 | 83.9 | 86.7 | 78.2 | 79.8 | 74.3 | 24.0 |
| 10 | 79.6 | 81.2 | 77.5 | 83.8 | 81.0 | 86.8 | 72.7 | 77.8 | 64.6 | 23.0 |
| 11 | 75.0 | 78.6 | 71.4 | 79.7 | 76.9 | 82.0 | 66.0 | 74.4 | 57.4 | 23.0 |
| 12 | 71.4 | 76.9 | 62.1 | 77.5 | 72.5 | 79.6 | 59.2 | 70.5 | 48.8 | 24.0 |
Reorganize
For the sake of convenience later, let’s rearrange these data arrays before saving them. It will be more useful to have record years as data variables instead of a dimension, but we’ll have to do some renaming in order to pull that off.
First, separate the records and years into smaller xarrays:
day_records = daily_records.sel(variable=[i for i in daily_records.coords['variable'].values if 'Year' not in i])
day_years = daily_records.sel(variable=[i for i in daily_records.coords['variable'].values if 'Year' in i])
mon_records = monthly_records.sel(variable=[i for i in monthly_records.coords['variable'].values if 'Year' not in i])
mon_years = monthly_records.sel(variable=[i for i in monthly_records.coords['variable'].values if 'Year' in i])Next, add “Year” to all of the variable names and remove it from the coordinate name:
day_years = day_years.rename_vars({i:i+' Year' for i in day_years.data_vars})
day_years.coords['variable'] = [i.removesuffix(' Year') for i in day_years.coords['variable'].values]
mon_years = mon_years.rename_vars({i:i+' Year' for i in mon_years.data_vars})
mon_years.coords['variable'] = [i.removesuffix(' Year') for i in mon_years.coords['variable'].values]Now we can merge these two xarrays together, rearrange the order of the variables, and drop those that do not contain a year, such as daily average.
daily_records = xr.merge([day_records, day_years])
daily_records = daily_records[[item for items in zip(day_records.data_vars, day_years.data_vars) for item in items]]
daily_records = daily_records.drop_vars([x for x in daily_records.data_vars if daily_records[x].isnull().all()])
monthly_records = xr.merge([mon_records, mon_years])
monthly_records = monthly_records[[item for items in zip(mon_records.data_vars, mon_years.data_vars) for item in items]]
monthly_records = monthly_records.drop_vars([x for x in monthly_records.data_vars if monthly_records[x].isnull().all()])monthly_records<xarray.Dataset> Size: 3kB
Dimensions: (month: 12, variable: 2)
Coordinates:
* month (month) int64 96B 1 2 3 4 5 ... 9 10 11 12
* variable (variable) object 16B 'Air Temperature'...
Data variables: (12/16)
Monthly Average (month, variable) float64 192B 68.7 ......
Record High Monthly Average (month, variable) float64 192B 72.6 ......
Record High Monthly Average Year (month, variable) float64 192B 2.013e+0...
Record Low Monthly Average (month, variable) float64 192B 63.0 ......
Record Low Monthly Average Year (month, variable) float64 192B 2.001e+0...
Average High (month, variable) float64 192B 76.0 ......
... ...
Average Low (month, variable) float64 192B 55.6 ......
Highest Low (month, variable) float64 192B 63.5 ......
Highest Low Year (month, variable) float64 192B 2.013e+0...
Record Low (month, variable) float64 192B 48.3 ......
Record Low Year (month, variable) float64 192B 1.997e+0...
Years (month, variable) float64 192B 23.0 ......
Attributes:
datum: MHHW
day_threshold: 2
hr_threshold: 3
last_updated: 2024-05-25 10:00:00
stationid: 8723214
stationname: Virginia Key, FL
tz: lst
unit_system: english
Air Temperature units: F
Water Temperature units: F
Air Temperature data range: ('1994-04-01', '2024-04-30')
Water Temperature data range: ('1994-04-01', '2024-04-30')Finally, let’s convert years to integers since we do not need decimal years.
daily_records[[i for i in daily_records.data_vars if "Year" in i]] = \
daily_records[[i for i in daily_records.data_vars if "Year" in i]].astype(int)
monthly_records[[i for i in monthly_records.data_vars if "Year" in i]] = \
monthly_records[[i for i in monthly_records.data_vars if "Year" in i]].astype(int)‘yearday’ is not intuitive, so we can change it to calendar day instead and rename the coordinate. Similarly, we can use month names instead of numbers for the sake of clarity.
daily_records.coords['yearday'] = pd.date_range(start='2020-01-01', end='2020-12-31', freq='1D').strftime('%d-%b')
daily_records = daily_records.rename({'yearday':'Date'})
monthly_records.coords['month'] = pd.date_range(start='2020-01-01', end='2020-12-31', freq='1m').strftime('%b')
monthly_records = monthly_records.rename({'month': 'Month'})Now take a look at the final products
daily_records<xarray.Dataset> Size: 97kB
Dimensions: (Date: 366, variable: 2)
Coordinates:
* variable (variable) object 16B 'Air Temperature' '...
* Date (Date) object 3kB '01-Jan' ... '31-Dec'
Data variables: (12/16)
Daily Average (Date, variable) float64 6kB 71.5 ... 72.6
Record High Daily Average (Date, variable) float64 6kB 78.0 ... 80.5
Record High Daily Average Year (Date, variable) int64 6kB 2022 ... 2021
Record Low Daily Average (Date, variable) float64 6kB 54.4 ... 66.1
Record Low Daily Average Year (Date, variable) int64 6kB 2001 ... 2010
Average High (Date, variable) float64 6kB 75.0 ... 73.9
... ...
Average Low (Date, variable) float64 6kB 67.9 ... 71.4
Highest Low (Date, variable) float64 6kB 76.8 ... 79.3
Highest Low Year (Date, variable) int64 6kB 2022 ... 2021
Record Low (Date, variable) float64 6kB 45.5 ... 64.4
Record Low Year (Date, variable) int64 6kB 2001 ... 2010
Years (Date, variable) int64 6kB 23 24 ... 23 24
Attributes:
datum: MHHW
day_threshold: 2
hr_threshold: 3
last_updated: 2024-05-25 10:00:00
stationid: 8723214
stationname: Virginia Key, FL
tz: lst
unit_system: english
Air Temperature units: F
Water Temperature units: F
Air Temperature data range: ('1994-04-01', '2024-04-30')
Water Temperature data range: ('1994-04-01', '2024-04-30')monthly_records<xarray.Dataset> Size: 3kB
Dimensions: (Month: 12, variable: 2)
Coordinates:
* variable (variable) object 16B 'Air Temperature'...
* Month (Month) object 96B 'Jan' 'Feb' ... 'Dec'
Data variables: (12/16)
Monthly Average (Month, variable) float64 192B 68.7 ......
Record High Monthly Average (Month, variable) float64 192B 72.6 ......
Record High Monthly Average Year (Month, variable) int64 192B 2013 ... 2016
Record Low Monthly Average (Month, variable) float64 192B 63.0 ......
Record Low Monthly Average Year (Month, variable) int64 192B 2001 ... 2010
Average High (Month, variable) float64 192B 76.0 ......
... ...
Average Low (Month, variable) float64 192B 55.6 ......
Highest Low (Month, variable) float64 192B 63.5 ......
Highest Low Year (Month, variable) int64 192B 2013 ... 2016
Record Low (Month, variable) float64 192B 48.3 ......
Record Low Year (Month, variable) int64 192B 1997 ... 2010
Years (Month, variable) int64 192B 23 24 ... 24
Attributes:
datum: MHHW
day_threshold: 2
hr_threshold: 3
last_updated: 2024-05-25 10:00:00
stationid: 8723214
stationname: Virginia Key, FL
tz: lst
unit_system: english
Air Temperature units: F
Water Temperature units: F
Air Temperature data range: ('1994-04-01', '2024-04-30')
Water Temperature data range: ('1994-04-01', '2024-04-30')monthly_records.coords['variable']<xarray.DataArray 'variable' (variable: 2)> Size: 16B array(['Air Temperature', 'Water Temperature'], dtype=object) Coordinates: * variable (variable) object 16B 'Air Temperature' 'Water Temperature'
We can still choose one environmental variable at a time, but now we get all of the records and corresponding years:
monthly_records.sel(variable='Air Temperature').to_dataframe().drop('variable', axis=1)| Monthly Average | Record High Monthly Average | Record High Monthly Average Year | Record Low Monthly Average | Record Low Monthly Average Year | Average High | Lowest High | Lowest High Year | Record High | Record High Year | Average Low | Highest Low | Highest Low Year | Record Low | Record Low Year | Years | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Month | ||||||||||||||||
| Jan | 68.7 | 72.6 | 2013 | 63.0 | 2001 | 76.0 | 73.0 | 2011 | 78.0 | 2015 | 55.6 | 63.5 | 2013 | 48.3 | 1997 | 23 |
| Feb | 70.8 | 74.9 | 2018 | 65.5 | 1996 | 76.5 | 74.2 | 2000 | 78.6 | 2021 | 59.4 | 70.0 | 2018 | 47.9 | 1996 | 23 |
| Mar | 72.3 | 77.6 | 2003 | 66.1 | 2010 | 78.5 | 74.2 | 2010 | 82.8 | 2003 | 63.3 | 72.0 | 1997 | 55.1 | 1996 | 24 |
| Apr | 75.6 | 79.4 | 2020 | 72.8 | 2004 | 80.8 | 77.3 | 2004 | 85.8 | 2020 | 68.3 | 72.6 | 2015 | 61.2 | 2009 | 24 |
| May | 78.7 | 80.7 | 1995 | 77.0 | 2013 | 82.5 | 80.8 | 2014 | 85.2 | 1995 | 73.8 | 77.1 | 2003 | 67.9 | 1999 | 21 |
| Jun | 81.5 | 83.6 | 2010 | 79.8 | 2014 | 84.8 | 82.8 | 2014 | 87.6 | 2009 | 77.6 | 80.8 | 2004 | 75.1 | 1995 | 20 |
| Jul | 82.9 | 85.0 | 2023 | 81.0 | 2013 | 85.8 | 84.2 | 2012 | 88.7 | 2018 | 79.0 | 82.3 | 2022 | 76.1 | 2013 | 25 |
| Aug | 83.2 | 85.9 | 2022 | 81.8 | 1994 | 85.7 | 84.0 | 2003 | 88.5 | 2022 | 79.3 | 83.6 | 2022 | 76.1 | 1996 | 24 |
| Sep | 82.0 | 82.7 | 2017 | 80.6 | 2001 | 85.1 | 83.9 | 2000 | 86.7 | 2021 | 78.2 | 79.8 | 2009 | 74.3 | 2001 | 24 |
| Oct | 79.6 | 81.2 | 2020 | 77.5 | 2000 | 83.8 | 81.0 | 2010 | 86.8 | 2023 | 72.7 | 77.8 | 1995 | 64.6 | 2005 | 23 |
| Nov | 75.0 | 78.6 | 2015 | 71.4 | 2012 | 79.7 | 76.9 | 2012 | 82.0 | 2020 | 66.0 | 74.4 | 2020 | 57.4 | 2006 | 23 |
| Dec | 71.4 | 76.9 | 2015 | 62.1 | 2010 | 77.5 | 72.5 | 2010 | 79.6 | 1994 | 59.2 | 70.5 | 2015 | 48.8 | 2010 | 24 |
Finally, write these to file for safe keeping.
daily_records.to_netcdf(os.path.join(outdir, 'statistics-daily.nc'), mode='w')
monthly_records.to_netcdf(os.path.join(outdir, 'statistics-monthly.nc'), mode='w')<frozen importlib._bootstrap>:241: RuntimeWarning: numpy.ndarray size changed, may indicate binary incompatibility. Expected 16 from C header, got 96 from PyObject
We will plot these results in Part 3, NOAA-CO-OPS-plots.